# Computations
import numpy as np
import pandas as pd
# sklearn
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, KFold, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier
# Visualisation libraries
## Progress Bar
import progressbar
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
# NOTE(review): the 'seaborn-whitegrid' style name was removed in
# matplotlib >= 3.6 (renamed 'seaborn-v0_8-whitegrid') — confirm the
# pinned matplotlib version.
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
# Global figure defaults used by every matplotlib plot below
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
# IPython magic (notebook export) — renders figures inline
%matplotlib inline
import warnings
# Suppress library deprecation noise in the rendered article
warnings.filterwarnings("ignore")
In this article, we analyze and predict customer churn for Telco Customer Churn data.
| Columns | Description |
|---|---|
| customerID | Customer ID |
| gender | Whether the customer is a male or a female |
| SeniorCitizen | Whether the customer is a senior citizen or not (1, 0) |
| Partner | Whether the customer has a partner or not (Yes, No) |
| Dependents | Whether the customer has dependents or not (Yes, No) |
| tenure | Number of months the customer has stayed with the company |
| PhoneService | Whether the customer has a phone service or not (Yes, No) |
| MultipleLines | Whether the customer has multiple lines or not (Yes, No, No phone service) |
| InternetService | Customer’s internet service provider (DSL, Fiber optic, No) |
| OnlineSecurity | Whether the customer has online security or not (Yes, No, No internet service) |
| OnlineBackup | Whether the customer has an online backup or not (Yes, No, No internet service) |
| DeviceProtection | Whether the customer has device protection or not (Yes, No, No internet service) |
| TechSupport | Whether the customer has tech support or not (Yes, No, No internet service) |
| StreamingTV | Whether the customer has streaming TV or not (Yes, No, No internet service) |
| StreamingMovies | Whether the customer has streaming movies or not (Yes, No, No internet service) |
| Contract | The contract term of the customer (Month-to-month, One year, Two years) |
| PaperlessBilling | Whether the customer has paperless billing or not (Yes, No) |
| PaymentMethod | The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)) |
| MonthlyCharges | The amount charged to the customer monthly |
| TotalCharges | The total amount charged to the customer |
| Churn | Whether the customer churned or not (Yes or No) |
# Path to the raw Kaggle Telco churn CSV; the file actually loaded below is a
# pre-standardized version derived from it ('*_STD.csv').
Path = 'telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
Data = pd.read_csv(Path.split(".")[0]+'_STD.csv')  # swap '.csv' for '_STD.csv'
# Drop the identifier column — it carries no predictive signal.
# NOTE(review): the _STD file names it 'customer ID' (with a space), unlike
# the raw file's 'customerID' — verify against the preprocessing step.
df = Data.drop(columns = ['customer ID'])
Target = 'Churn'        # binary label column: 0 = No, 1 = Yes
Labels = ['No', 'Yes']  # display names for label values 0 and 1
display(Data.head(6).style.hide_index().set_precision(2))  # preview the data
| customer ID | Gender | Senior Citizen | Partner | Dependents | Tenure | Phone Service | Multiple Lines | Internet Service | Online Security | Online Backup | Device Protection | Tech Support | Streaming TV | Streaming Movies | Contract | Paperless Billing | Monthly Charges | Total Charges | Churn | Bank transfer (automatic) | Credit card (automatic) | Electronic check | Mailed check |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7590-VHVEG | -1.01 | -0.44 | 1.03 | -0.65 | -1.28 | -3.05 | -0.85 | -0.29 | -0.10 | 1.18 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | -1.16 | -0.99 | 0 | -0.53 | -0.53 | 1.41 | -0.54 |
| 5575-GNVDE | 0.99 | -0.44 | -0.97 | -0.65 | 0.07 | 0.33 | -0.85 | -0.29 | 1.32 | -0.17 | 1.18 | -0.10 | -0.22 | -0.23 | 0.37 | -1.21 | -0.26 | -0.17 | 0 | -0.53 | -0.53 | -0.71 | 1.84 |
| 3668-QPYBK | 0.99 | -0.44 | -0.97 | -0.65 | -1.24 | 0.33 | -0.85 | -0.29 | 1.32 | 1.18 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | -0.36 | -0.96 | 1 | -0.53 | -0.53 | -0.71 | 1.84 |
| 7795-CFOCW | 0.99 | -0.44 | -0.97 | -0.65 | 0.51 | -3.05 | -0.85 | -0.29 | 1.32 | -0.17 | 1.18 | 1.31 | -0.22 | -0.23 | 0.37 | -1.21 | -0.75 | -0.20 | 0 | 1.89 | -0.53 | -0.71 | -0.54 |
| 9237-HQITU | -1.01 | -0.44 | -0.97 | -0.65 | -1.24 | 0.33 | -0.85 | 1.00 | -0.10 | -0.17 | -0.17 | -0.10 | -0.22 | -0.23 | -0.83 | 0.83 | 0.20 | -0.94 | 1 | -0.53 | -0.53 | 1.41 | -0.54 |
| 9305-CDSKC | -1.01 | -0.44 | -0.97 | -0.65 | -0.99 | 0.33 | 1.17 | 1.00 | -0.10 | -0.17 | 1.18 | -0.10 | 1.10 | 1.09 | -0.83 | 0.83 | 1.16 | -0.65 | 1 | -0.53 | -0.53 | 1.41 | -0.54 |
First, consider the data distribution for the customer Churn target.
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print *Text* on a colored banner, padded with '=' to a total width of L.

    C selects the banner background (and the rule color); T selects the
    banner text color.
    """
    backgrounds = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                   'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                   'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    foregrounds = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                   'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                   'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    banner = backgrounds[C] + foregrounds[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = foregrounds[C] + Style.NORMAL + (L - len(Text) - 1) * '=' + Style.RESET_ALL
    print(banner + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in color C."""
    foregrounds = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                   'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                   'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(foregrounds[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Dist_Table(Inp, Target = Target):
    """Return the class distribution of *Target* in *Inp*.

    Produces a DataFrame with one row per class and columns
    [Target, 'Count', 'Percentage'], mapping label values 0/1 to the
    global `Labels` names.
    """
    Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
    Table[Target] = Table[Target].replace(dict(zip([0,1],Labels)))
    # Bug fix: each class's share is count/total*100. The original computed
    # `100 - ...`, which assigned every class the *complementary* percentage
    # (with two classes, the two shares were swapped in the table).
    Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
    return Table
def Dist_Plot(Table, PieColors = ['FireBrick', 'SeaGreen'], TableColors = ['Navy','White']):
    """Render the class-distribution *Table* as a side-by-side plotly table and donut pie.

    Table  -- output of Dist_Table: columns [Target, 'Count', 'Percentage'].
    PieColors   -- slice colors, in Table row order.
    TableColors -- [header fill, cell fill] for the left-hand table.

    NOTE(review): the mutable list defaults are shared across calls; harmless
    here because they are never mutated.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right: donut pie of the class counts, second slice pulled out.
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values, pull=[0, 0.1], textfont=dict(size=16),
                         marker=dict(colors = PieColors, line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=.5)
    fig.update_layout(height = 400, legend=dict(orientation="v"), legend_title_text= Target)
    # Left: table of the counts and percentages (percentage to two decimals).
    T = Table.copy()
    T['Percentage'] = T['Percentage'].map(lambda x: '%.2f' % x)
    Temp = [T.loc[:, col].values for col in T.columns]
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = [0.4, 0.2, 0.2],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    # Bug fix: the title previously rendered as "<b>ChurnDistribution<b>" —
    # missing space before "Distribution" and an unclosed bold tag.
    fig.update_layout(title={'text': '<b>' + Target + ' Distribution</b>', 'x':0.5,
                             'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Build the churn distribution table on the full data set and plot it.
# Colors follow Table row order (value_counts is count-descending, so the
# majority 'No' class comes first — presumably SeaGreen = No, Tomato = Yes).
Table = Dist_Table(Data)
Dist_Plot(Table, PieColors = ['SeaGreen', 'Tomato'], TableColors = ['DarkGreen','White'])
StratifiedShuffleSplit returns stratified randomized train/test splits: each split preserves approximately the same percentage of samples of each target class as the complete set.
# Feature matrix and label vector as plain NumPy arrays
X = df.drop(columns = Target).values
y = df[Target].values
Test_Size = 0.3
# One stratified shuffle split: train/test keep the original class ratio.
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Plot(y_train = y_train, y_test = y_test, Colors = ['FireBrick', 'SeaGreen']):
    """Plot side-by-side donut charts of the class balance in the train and test sets.

    Defaults bind the module-level y_train/y_test at definition time.
    NOTE(review): slice names come from the global `Labels` while the values
    come from np.unique(..., return_counts=True); the pairing is correct only
    because sorted unique labels are [0, 1] — confirm if labels ever change.
    """
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}]*2])
    # Left donut: train-set class counts
    _, Temp = np.unique(y_train, return_counts=True)
    fig.add_trace(go.Pie(labels=Labels,
                         values= Temp,
                         pull=[0, 0.1],
                         name= 'Train Set',
                         textfont=dict(size=16),
                         marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, 1)
    # Right donut: test-set class counts
    _, Temp = np.unique(y_test, return_counts=True)
    fig.add_trace(go.Pie(labels=Labels,
                         values=Temp,
                         pull=[0, 0.1],
                         name= 'Test Set',
                         textfont=dict(size=16),
                         marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, 2)
    fig.update_traces(hole=.5)
    # Annotations sit in the donut holes; coordinates are paper-relative.
    fig.update_layout(height = 400, legend=dict(orientation="v"),
                      legend_title_text= Target,
                      annotations=[dict(text= '<b>' + 'Train<br>Set' + '<b>', x=0.195, y=0.5, font_size=14, showarrow=False),
                                   dict(text= '<b>' + 'Test<br>Set' + '<b>', x=0.8, y=0.5, font_size=14, showarrow=False)],
                      title={'text': '<b>' + Target + '<b>', 'x':0.48, 'y': .83, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Show the split just created (green = No, tomato = Yes)
Train_Test_Plot(Colors = ['SeaGreen', 'Tomato'])
Decision Tree Classifier uses a decision tree (as a predictive model) to go from observations about an item (represented in the branches) to conclusions about the item's target value (represented in the leaves).
def Grid_Table(grid):
    """Summarize a fitted search's cv_results_ as a DataFrame indexed by rank.

    Returns columns ['params', 'mean_test_score', 'mean_fit_time'] (rounded to
    4 decimals), sorted best-rank first, with the parameter dicts rendered as
    plain "key: value" strings.
    """
    results = grid.cv_results_
    param_strings = [
        str(p).replace('{', '').replace('}', '').replace("'", '')
        for p in results['params']
    ]
    summary = pd.DataFrame({
        'rank_test_score': results['rank_test_score'],
        'params': param_strings,
        'mean_test_score': results['mean_test_score'],
        'mean_fit_time': results['mean_fit_time'],
    })
    return summary.round(4).sort_values('rank_test_score').set_index('rank_test_score')
def Grid_Performance_Plot(Table):
    """Plot mean test score and mean fit time per parameter set, side by side.

    Table -- output of Grid_Table (indexed by rank, with 'params',
    'mean_test_score', 'mean_fit_time' columns).

    NOTE(review): yerr is set to the metric values themselves rather than a
    spread measure such as cv_results_['std_test_score'] — confirm intent.
    """
    font = FontProperties()
    font.set_weight('bold')
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    Z = zip(axes, ['mean_test_score', 'mean_fit_time'], ['Blue', 'Red'],['Classification Accuracy', 'Fit Time (with caching)'])
    for ax, col, c, title in Z:
        _ = ax.errorbar(x = Table['params'], y = Table[col], yerr = Table[col], color = c)
        _ = ax.set_xticklabels(labels = Table['params'],rotation=90, fontsize = 10)
        _ = ax.set_ylim(bottom = 0)
        _ = ax.set_xlabel('Parameters')  # typo fix: was 'Paramerers'
        _ = ax.set_title(title, fontproperties=font, fontsize = 14)
# Randomized hyper-parameter search for the decision tree, scored by ROC AUC.
# NOTE(review): cv = KFold(n_splits = X.shape[1]) creates one fold per FEATURE
# column, which looks accidental — confirm the intended fold count.
# NOTE(review): n_iter = 30 exceeds the 2 * 12 = 24 possible parameter
# combinations, so the "randomized" search effectively enumerates the grid.
grid_dtc = RandomizedSearchCV(DecisionTreeClassifier(),
                              {'criterion':['gini','entropy'], 'max_depth': np.arange(2,14)},
                              cv = KFold(n_splits = X.shape[1], shuffle = True),
                              n_iter = 30,
                              scoring = 'roc_auc',
                              error_score = 0,
                              verbose = 3,
                              n_jobs = 10,
                              refit = True)
_ = grid_dtc.fit(X_train, y_train)
clear_output()
# Best result summary (typo fix: column was labeled 'Best Paramerers')
display(pd.DataFrame({'Best Score': [grid_dtc.best_score_], 'Best Parameters': [str(grid_dtc.best_params_)],
                      'Accuracy': [grid_dtc.score(X_test,y_test)]}).round(4).style.hide_index().set_precision(4))
# Top-10 parameter sets, shaded by score (green) and fit time (orange)
Table = Grid_Table(grid_dtc)
display(Table.reset_index(drop = False).head(10).style.hide_index().\
        set_precision(4).background_gradient(subset= ['mean_test_score'], cmap='Greens').\
        background_gradient(subset= ['mean_fit_time'], cmap='Oranges'))
Grid_Performance_Plot(Table)
| Best Score | Best Parameters | Accuracy |
|---|---|---|
| 0.8348 | {'max_depth': 5, 'criterion': 'gini'} | 0.8299 |
| rank_test_score | params | mean_test_score | mean_fit_time |
|---|---|---|---|
| 1 | max_depth: 5, criterion: gini | 0.8348 | 0.0093 |
| 2 | max_depth: 4, criterion: gini | 0.8315 | 0.0081 |
| 3 | max_depth: 5, criterion: entropy | 0.8299 | 0.0115 |
| 4 | max_depth: 4, criterion: entropy | 0.8269 | 0.0100 |
| 5 | max_depth: 6, criterion: gini | 0.8242 | 0.0107 |
| 6 | max_depth: 6, criterion: entropy | 0.8218 | 0.0125 |
| 7 | max_depth: 3, criterion: gini | 0.8206 | 0.0066 |
| 8 | max_depth: 3, criterion: entropy | 0.8147 | 0.0076 |
| 9 | max_depth: 7, criterion: gini | 0.8115 | 0.0121 |
| 10 | max_depth: 7, criterion: entropy | 0.8103 | 0.0144 |
Now, we can develop a model using the best parameters for the DTC. Here, we use 10 stratified randomized folds made by preserving the percentage of samples for each class.
# Repeated stratified hold-out evaluation: 10 random 70/30 splits, each
# preserving the class ratio; the tree is refit on every split with the best
# hyper-parameters found above.
n_splits = 10
sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
# Per-split classification-report value arrays and confusion matrices
Reports_Train = []
Reports_Test = []
CM_Train = []
CM_Test = []
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train the model with the best parameters
    dtc = DecisionTreeClassifier(**grid_dtc.best_params_)
    _ = dtc.fit(X_train,y_train)
    # Train-set metrics for this split
    y_pred = dtc.predict(X_train)
    R = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=Labels, output_dict=True)).T
    Reports_Train.append(R.values)
    CM_Train.append(metrics.confusion_matrix(y_train, y_pred))
    # Test-set metrics for this split
    y_pred = dtc.predict(X_test)
    R = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
    Reports_Test.append(R.values)
    CM_Test.append(metrics.confusion_matrix(y_test, y_pred))
# Train: aggregate the per-fold reports and confusion matrices.
# Stack every fold's flattened report once into a (n_folds, n_cells) matrix
# instead of re-vstacking inside a loop (the original also crashed when only
# one fold was collected, since a 1-D mean could not be reshaped).
ALL = np.vstack([rep.ravel() for rep in Reports_Train])
CM = np.vstack([cm.ravel() for cm in CM_Train])
# Cell-wise mean and std across folds, restored to the report's shape
Mean = pd.DataFrame(ALL.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
STD = pd.DataFrame(ALL.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
Reports_Train = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
# Average confusion matrix, rounded to integer counts
CM_Train = CM.mean(axis = 0).reshape(CM_Train[0].shape).round(0).astype(int)
del ALL, Mean, STD
# Test: same aggregation as the train set — stack once, then take the
# cell-wise mean ± std and the averaged confusion matrix.
ALL = np.vstack([rep.ravel() for rep in Reports_Test])
CM = np.vstack([cm.ravel() for cm in CM_Test])
Mean = pd.DataFrame(ALL.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
STD = pd.DataFrame(ALL.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
Reports_Test = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
CM_Test = CM.mean(axis = 0).reshape(CM_Test[0].shape).round(0).astype(int)
del ALL, Mean, STD
# Label and display the aggregated report tables.
# Format fix: '% i' is the space flag and pads positive numbers with a
# leading space, rendering "CV =  10"; '%i' gives the intended "CV = 10".
Reports_Train.index.name = 'Train Set (CV = %i)' % n_splits
Reports_Test.index.name = 'Test Set (CV = %i)' % n_splits
display(Reports_Train)
display(Reports_Test)
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| Train Set (CV = 10) | ||||
| No | 0.8561 ± 0.0168 | 0.8779 ± 0.0226 | 0.8664 ± 0.0024 | 3622.0000 ± 0.0000 |
| Yes | 0.6379 ± 0.0197 | 0.5890 ± 0.0690 | 0.6090 ± 0.0329 | 1308.0000 ± 0.0000 |
| accuracy | 0.8012 ± 0.0027 | 0.8012 ± 0.0027 | 0.8012 ± 0.0027 | 0.8012 ± 0.0027 |
| macro avg | 0.7470 ± 0.0032 | 0.7334 ± 0.0233 | 0.7377 ± 0.0156 | 4930.0000 ± 0.0000 |
| weighted avg | 0.7983 ± 0.0076 | 0.8012 ± 0.0027 | 0.7981 ± 0.0076 | 4930.0000 ± 0.0000 |
| precision | recall | f1-score | support | |
|---|---|---|---|---|
| Test Set (CV = 10) | ||||
| No | 0.8504 ± 0.0149 | 0.8654 ± 0.0267 | 0.8574 ± 0.0071 | 1552.0000 ± 0.0000 |
| Yes | 0.6111 ± 0.0275 | 0.5766 ± 0.0640 | 0.5899 ± 0.0296 | 561.0000 ± 0.0000 |
| accuracy | 0.7887 ± 0.0076 | 0.7887 ± 0.0076 | 0.7887 ± 0.0076 | 0.7887 ± 0.0076 |
| macro avg | 0.7308 ± 0.0099 | 0.7210 ± 0.0203 | 0.7237 ± 0.0141 | 2113.0000 ± 0.0000 |
| weighted avg | 0.7869 ± 0.0076 | 0.7887 ± 0.0076 | 0.7864 ± 0.0080 | 2113.0000 ± 0.0000 |
Similarly, for Confusion Matrix, we take the average of the confusion matrices of all folds.
# Font for the figure titles
font = FontProperties()
font.set_weight('bold')
# Format fix: '%i' (not '% i') so titles read "CV = 10", not "CV =  10"
Titles = ['Train Set (CV = %i)' % n_splits, 'Test Set (CV = %i)' % n_splits]
CM = [CM_Train, CM_Test]
# One figure per set: raw counts (left) and row-normalized rates (right)
for i in range(2):
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(Titles[i], fontproperties=font, fontsize = 16)
    # Averaged confusion-matrix counts
    _ = sns.heatmap(CM[i], annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                    linewidths = 0.2, cbar_kws={"shrink": 1})
    _ = ax[0].set_title('Confusion Matrix');
    # Each row divided by its true-class total -> per-class rates in [0, 1]
    _ = sns.heatmap(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis],
                    annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
                    linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
    _ = ax[1].set_title('Normalized Confusion Matrix');
    for a in ax:
        _ = a.set_xlabel('Predicted labels')
        _ = a.set_ylabel('True labels');
        _ = a.xaxis.set_ticklabels(Labels)
        _ = a.yaxis.set_ticklabels(Labels)
        _ = a.set_aspect(1)
Note that: $$\text{Accuracy} = \frac{T_p + T_n}{T_p + T_n + F_p + F_n},$$
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
However, the accuracy can be a misleading metric for imbalanced data sets. Here, over 73 percent of the sample has negative (No) and about 27 percent has positive (Yes) values. In these cases, a balanced accuracy (bACC) [4] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two:
\begin{align} \text{TPR} &= \frac{T_p}{T_p + F_n},\\ \text{TNR} &= \frac{T_n}{T_n + F_p},\\ \text{Balanced Accuracy (bACC)} &= \frac{1}{2}\left(\text{TPR}+\text{TNR}\right) \end{align}
Header('Train Set')
# sklearn confusion-matrix layout: ravel() -> tn, fp, fn, tp
tn, fp, fn, tp = CM_Train.ravel()
Precision = tp/(tp+fp)
Recall = tp/(tp + fn)
TPR = tp/(tp +fn)   # sensitivity (identical to Recall, printed separately)
TNR = tn/(tn +fp)   # specificity
BA = (TPR + TNR)/2  # balanced accuracy
print('Precision (Train) = %.2f' % Precision)
print('Recall (Train) = %.2f' % Recall)
print('TPR (Train) = %.2f' % TPR)
print('TNR (Train) = %.2f' % TNR)
print('Balanced Accuracy (Train) = %.2f' % BA)
Header('Test Set')
tn, fp, fn, tp = CM_Test.ravel()
Precision = tp/(tp+fp)
Recall = tp/(tp + fn)
TPR = tp/(tp +fn)
TNR = tn/(tn +fp)
BA = (TPR + TNR)/2
# (removed a dead PPCR computation that was never printed or cleaned up)
print('Precision (Test) = %.2f' % Precision)
print('Recall (Test) = %.2f' % Recall)
print('TPR (Test) = %.2f' % TPR)
print('TNR (Test) = %.2f' % TNR)
print('Balanced Accuracy (Test) = %.2f' % BA)
del tn, fp, fn, tp, Precision, Recall, TPR, TNR, BA
Line()
Train Set ========================================================================================== Precision (Train) = 0.64 Recall (Train) = 0.59 TPR (Train) = 0.59 TNR (Train) = 0.88 Balanced Accuracy (Train) = 0.73 Test Set =========================================================================================== Precision (Test) = 0.61 Recall (Test) = 0.58 TPR (Test) = 0.58 TNR (Test) = 0.87 Balanced Accuracy (Test) = 0.72 ====================================================================================================
To improve the results, we can implement an iterative optimization deep learning method.